import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import sklearn
from prophet import Prophet
from scipy.stats import mode
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', None)
The second dataset, METEO, contains hourly information from The Weather Company weather stations in the area of influence in the period from 29-06-2015 to 30-06-2022, it has the following variables:
• validTimeUtc: Date
• precip1Hour: Rain volume in the last hour
• precip6Hour: Rain volume in the last 6 hours
• precip24Hour: Rain volume in the last 24 hours
• precip2Day: Rain volume in the last 2 days
• precip3Day: Rain volume in the last 3 days
• precip7Day: Rain volume in the last 7 days
• precipMtd: Rain volume in the current month
• precipYtd: Rain volume in the current year
• pressureChange: Maximum variation in atmospheric pressure in the last 3 hours
• pressureMeanSeaLevel: Barometric difference with respect to sea level
• relativeHumidity: Relative humidity
• snow1Hour: Snow volume in the last hour
• snow6Hour: Snow volume in the last 6 hours
• snow24Hour: Snow volume in the last 24 hours
• snow2Day: Snow volume in the last 2 days
• snow3Day: Snow volume in the last 3 days
• snow7Day: Snow volume in the last 7 days
• snowMtd: Snow volume in the current month
• snowSeason: Quarterly snow volume (DIC-FEB, MAR-MAY, JUN-AGO, SEP-NOV)
• snowYtd: Snow volume in the current year
• temperature: Ambient temperature at 2 meters above the ground
• temperatureChange24Hour: Temperature variation from previous day
• temperatureMax24Hour: Maximum temperature last 24 hours
• temperatureMin24Hour: Minimum temperature last 24 hours
• temperatureDewPoint: DewPoint, temperature at which the air must be cooled at constant pressure to reach saturation. The dew point is also an indirect measure of air humidity.
• temperatureFeelsLike: Thermal sensation. Apparent temperature resulting from a combination of temperature, humidity and wind speed.
• uvIndex: Ultraviolet radiation categorized: -2, -1 = not available / 0-2 = low / 3-5 = moderate / 6-7 = high / 8-10 = very high / 11-16 = extreme.
• visibility: Horizontal visibility from the weather station, 999 equals unlimited
• windDirection: Wind Direction in degrees 0 - North, 90 - East, 180 - South, 270 - West
• windGust: Maximum wind gust velocity recorded during the observation period
• windSpeed: Wind force
• ID_ESTACION: Weather station identifier
Import and brief exploration of the dataset
METEO = pd.read_csv('./Datos_Originales/DATOS_METEO.TXT',delimiter = '|', na_values = 'NA', encoding='UTF-8', parse_dates=['validTimeUtc'])
METEO.head()
| validTimeUtc | precip1Hour | precip6Hour | precip24Hour | precip2Day | precip3Day | precip7Day | precipMtd | precipYtd | pressureChange | pressureMeanSeaLevel | relativeHumidity | snow1Hour | snow6Hour | snow24Hour | snow2Day | snow3Day | snow7Day | snowMtd | snowSeason | snowYtd | temperature | temperatureChange24Hour | temperatureMax24Hour | temperatureMin24Hour | temperatureDewPoint | temperatureFeelsLike | uvIndex | visibility | windDirection | windGust | windSpeed | ID_ESTACION | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015-06-29 16:20:00 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | -1.4 | NaN | 27.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 34.5 | -1.1 | 36.3 | 17.9 | 12.8 | 34.5 | 2.0 | 16.09 | NaN | NaN | 18.7 | 13 |
| 1 | 2015-06-29 17:20:00 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | -1.0 | NaN | 26.3 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 34.3 | -0.4 | 35.0 | 17.9 | 12.3 | 34.3 | 1.0 | 16.09 | NaN | NaN | 18.0 | 13 |
| 2 | 2015-06-29 18:20:00 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | -0.3 | NaN | 29.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 32.8 | 0.1 | 34.7 | 17.9 | 12.4 | 32.8 | 0.0 | 16.09 | NaN | NaN | 16.6 | 13 |
| 3 | 2015-06-29 19:20:00 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | 0.3 | NaN | 33.2 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 31.0 | 0.3 | 34.7 | 17.9 | 12.9 | 31.0 | 0.0 | 16.09 | NaN | NaN | 15.1 | 13 |
| 4 | 2015-06-29 20:20:00 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | 0.9 | NaN | 43.1 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 27.6 | 0.0 | 34.7 | 17.9 | 13.9 | 28.0 | 0.0 | 16.09 | NaN | NaN | 10.1 | 13 |
METEO.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1223660 entries, 0 to 1223659 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 validTimeUtc 1223660 non-null datetime64[ns] 1 precip1Hour 1223640 non-null float64 2 precip6Hour 1223140 non-null float64 3 precip24Hour 1223540 non-null float64 4 precip2Day 750600 non-null float64 5 precip3Day 750600 non-null float64 6 precip7Day 750600 non-null float64 7 precipMtd 750600 non-null float64 8 precipYtd 750600 non-null float64 9 pressureChange 1223640 non-null float64 10 pressureMeanSeaLevel 867520 non-null float64 11 relativeHumidity 1223660 non-null float64 12 snow1Hour 1223640 non-null float64 13 snow6Hour 1223140 non-null float64 14 snow24Hour 1223540 non-null float64 15 snow2Day 750600 non-null float64 16 snow3Day 750600 non-null float64 17 snow7Day 750600 non-null float64 18 snowMtd 750600 non-null float64 19 snowSeason 750600 non-null float64 20 snowYtd 750600 non-null float64 21 temperature 1223640 non-null float64 22 temperatureChange24Hour 1223280 non-null float64 23 temperatureMax24Hour 1223560 non-null float64 24 temperatureMin24Hour 1223560 non-null float64 25 temperatureDewPoint 1223640 non-null float64 26 temperatureFeelsLike 1223640 non-null float64 27 uvIndex 1223640 non-null float64 28 visibility 1223640 non-null float64 29 windDirection 867520 non-null float64 30 windGust 125679 non-null float64 31 windSpeed 1223660 non-null float64 32 ID_ESTACION 1223660 non-null int64 dtypes: datetime64[ns](1), float64(31), int64(1) memory usage: 308.1 MB
We separate the validTimeUtc variable into "Year", "Month", "Day" and "Time" for future study.
METEO['Year'] = list(map(str,METEO.validTimeUtc.dt.year))
METEO['Month'] = list(map(str,METEO.validTimeUtc.dt.month))
METEO['Day'] = list(map(str,METEO.validTimeUtc.dt.day))
METEO['Time'] = list(METEO.validTimeUtc.dt.time)
We keep the columns we are interested in, i.e. those with hourly data.
columns = ['validTimeUtc', 'Year', 'Month', 'Day', 'Time', 'precip1Hour', 'pressureChange', 'pressureMeanSeaLevel', 'relativeHumidity',
'snow1Hour', 'temperature', 'temperatureChange24Hour', 'temperatureMax24Hour', 'temperatureMin24Hour', 'temperatureDewPoint',
'temperatureFeelsLike', 'uvIndex', 'visibility', 'windSpeed', 'windDirection', 'ID_ESTACION']
METEO = METEO[[i for i in METEO.columns if i in columns]].copy(deep = True)
METEO['uvIndex'].replace([-2,-1], np.nan, inplace = True)
METEO
| validTimeUtc | precip1Hour | pressureChange | pressureMeanSeaLevel | relativeHumidity | snow1Hour | temperature | temperatureChange24Hour | temperatureMax24Hour | temperatureMin24Hour | temperatureDewPoint | temperatureFeelsLike | uvIndex | visibility | windDirection | windSpeed | ID_ESTACION | Year | Month | Day | Time | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015-06-29 16:20:00 | 0.0 | -1.4 | NaN | 27.0 | 0.0 | 34.5 | -1.1 | 36.3 | 17.9 | 12.8 | 34.5 | 2.0 | 16.09 | NaN | 18.7 | 13 | 2015 | 6 | 29 | 16:20:00 |
| 1 | 2015-06-29 17:20:00 | 0.0 | -1.0 | NaN | 26.3 | 0.0 | 34.3 | -0.4 | 35.0 | 17.9 | 12.3 | 34.3 | 1.0 | 16.09 | NaN | 18.0 | 13 | 2015 | 6 | 29 | 17:20:00 |
| 2 | 2015-06-29 18:20:00 | 0.0 | -0.3 | NaN | 29.0 | 0.0 | 32.8 | 0.1 | 34.7 | 17.9 | 12.4 | 32.8 | 0.0 | 16.09 | NaN | 16.6 | 13 | 2015 | 6 | 29 | 18:20:00 |
| 3 | 2015-06-29 19:20:00 | 0.0 | 0.3 | NaN | 33.2 | 0.0 | 31.0 | 0.3 | 34.7 | 17.9 | 12.9 | 31.0 | 0.0 | 16.09 | NaN | 15.1 | 13 | 2015 | 6 | 29 | 19:20:00 |
| 4 | 2015-06-29 20:20:00 | 0.0 | 0.9 | NaN | 43.1 | 0.0 | 27.6 | 0.0 | 34.7 | 17.9 | 13.9 | 28.0 | 0.0 | 16.09 | NaN | 10.1 | 13 | 2015 | 6 | 29 | 20:20:00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1223655 | 2022-06-30 19:20:00 | 0.0 | 2.7 | 1011.9 | 46.7 | 0.0 | 25.1 | 0.1 | 32.3 | 15.1 | 12.9 | 25.1 | 0.0 | 13.55 | 110.0 | 10.8 | 8 | 2022 | 6 | 30 | 19:20:00 |
| 1223656 | 2022-06-30 20:20:00 | 0.0 | 3.7 | 1013.1 | 57.1 | 0.0 | 22.9 | 0.2 | 32.3 | 15.1 | 13.9 | 22.9 | 0.0 | 13.59 | 80.0 | 9.0 | 8 | 2022 | 6 | 30 | 20:20:00 |
| 1223657 | 2022-06-30 21:20:00 | 0.0 | 3.8 | 1014.4 | 65.7 | 0.0 | 21.0 | 0.1 | 32.3 | 15.1 | 14.4 | 21.0 | 0.0 | 13.85 | 80.0 | 7.9 | 8 | 2022 | 6 | 30 | 21:20:00 |
| 1223658 | 2022-06-30 22:20:00 | 0.0 | 3.1 | 1015.1 | 75.5 | 0.0 | 19.9 | 0.1 | 32.3 | 15.1 | 15.4 | 19.9 | 0.0 | 13.17 | 50.0 | 8.3 | 8 | 2022 | 6 | 30 | 22:20:00 |
| 1223659 | 2022-06-30 23:20:00 | 0.0 | 2.0 | 1015.3 | 90.9 | 0.0 | 18.7 | -0.2 | 32.3 | 15.1 | 17.2 | 18.7 | 0.0 | 9.51 | 50.0 | 10.4 | 8 | 2022 | 6 | 30 | 23:20:00 |
1223660 rows × 21 columns
METEO.describe()
| validTimeUtc | precip1Hour | pressureChange | pressureMeanSeaLevel | relativeHumidity | snow1Hour | temperature | temperatureChange24Hour | temperatureMax24Hour | temperatureMin24Hour | temperatureDewPoint | temperatureFeelsLike | uvIndex | visibility | windDirection | windSpeed | ID_ESTACION | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1223660 | 1.223640e+06 | 1.223640e+06 | 867520.000000 | 1.223660e+06 | 1.223640e+06 | 1.223640e+06 | 1.223280e+06 | 1.223560e+06 | 1.223560e+06 | 1.223640e+06 | 1.223640e+06 | 1.223640e+06 | 1.223640e+06 | 867520.000000 | 1.223660e+06 | 1.223660e+06 |
| mean | 2019-01-01 13:32:01.906412032 | 6.657947e-02 | -4.230329e-03 | 1017.788293 | 6.691634e+01 | 7.397601e-04 | 1.542311e+01 | -4.719181e-02 | 2.173558e+01 | 1.006881e+01 | 8.223297e+00 | 1.474654e+01 | 1.354013e+00 | 1.286694e+01 | 184.346724 | 1.049545e+01 | 9.500000e+00 |
| min | 2015-06-29 16:20:00 | 0.000000e+00 | -6.200000e+00 | 988.600000 | 6.500000e+00 | 0.000000e+00 | -7.900000e+00 | -1.040000e+01 | 1.000000e-01 | -7.900000e+00 | -1.760000e+01 | -1.130000e+01 | 0.000000e+00 | 1.000000e-01 | 0.000000 | 0.000000e+00 | 0.000000e+00 |
| 25% | 2017-04-03 11:20:00 | 0.000000e+00 | -8.000000e-01 | 1014.000000 | 4.960000e+01 | 0.000000e+00 | 9.300000e+00 | -8.000000e-01 | 1.510000e+01 | 5.300000e+00 | 4.100000e+00 | 7.900000e+00 | 0.000000e+00 | 1.211000e+01 | 70.000000 | 5.400000e+00 | 4.750000e+00 |
| 50% | 2019-01-02 12:20:00 | 0.000000e+00 | 0.000000e+00 | 1017.500000 | 6.840000e+01 | 0.000000e+00 | 1.470000e+01 | 1.000000e-01 | 2.100000e+01 | 9.700000e+00 | 8.200000e+00 | 1.420000e+01 | 0.000000e+00 | 1.348000e+01 | 210.000000 | 8.600000e+00 | 9.500000e+00 |
| 75% | 2020-10-01 09:20:00 | 0.000000e+00 | 8.000000e-01 | 1021.800000 | 8.600000e+01 | 0.000000e+00 | 2.090000e+01 | 8.000000e-01 | 2.840000e+01 | 1.530000e+01 | 1.290000e+01 | 2.090000e+01 | 2.000000e+00 | 1.458000e+01 | 270.000000 | 1.400000e+01 | 1.425000e+01 |
| max | 2022-06-30 23:20:00 | 6.030000e+01 | 5.700000e+00 | 1037.000000 | 1.000000e+02 | 5.000000e+00 | 4.320000e+01 | 8.900000e+00 | 4.320000e+01 | 2.800000e+01 | 2.330000e+01 | 4.320000e+01 | 1.000000e+01 | 1.609000e+01 | 350.000000 | 5.870000e+01 | 1.900000e+01 |
| std | NaN | 5.892155e-01 | 1.129044e+00 | 6.363315 | 2.212223e+01 | 3.520595e-02 | 8.007426e+00 | 1.509542e+00 | 7.943553e+00 | 6.080957e+00 | 6.036721e+00 | 8.808097e+00 | 2.354698e+00 | 2.875003e+00 | 101.357652 | 6.858221e+00 | 5.766284e+00 |
columns_with_missing_data_METEO = METEO.isnull().sum()[METEO.isnull().sum() > 0].index
METEO.groupby('Year')[columns_with_missing_data_METEO].apply(lambda x: x.isnull().sum())
| precip1Hour | pressureChange | pressureMeanSeaLevel | snow1Hour | temperature | temperatureChange24Hour | temperatureMax24Hour | temperatureMin24Hour | temperatureDewPoint | temperatureFeelsLike | uvIndex | visibility | windDirection | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Year | |||||||||||||
| 2015 | 0 | 0 | 87760 | 0 | 0 | 140 | 100 | 100 | 0 | 0 | 0 | 0 | 87760 |
| 2016 | 20 | 20 | 174020 | 20 | 20 | 20 | 0 | 0 | 20 | 20 | 20 | 20 | 174020 |
| 2017 | 0 | 0 | 94360 | 0 | 0 | 100 | 0 | 0 | 0 | 0 | 0 | 0 | 94360 |
| 2018 | 0 | 0 | 0 | 0 | 0 | 60 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2019 | 0 | 0 | 0 | 0 | 0 | 60 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2020 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2021 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2022 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
sns.heatmap(METEO[columns_with_missing_data_METEO].isnull(), cbar=False);
fig, ax = plt.subplots(figsize=(28, 10))
g = sns.barplot(y=METEO[columns_with_missing_data_METEO].columns, x=METEO[columns_with_missing_data_METEO].isnull().sum(), orient = 'h', palette = 'Blues')
sns.set_theme(style="white")
g = sns.PairGrid(METEO, diag_sharey=False)
g.map_upper(sns.scatterplot, s=15)
g.map_lower(sns.scatterplot, s=15)
g.map_diag(sns.kdeplot, lw=2)
plt.show()